##### replicating figures 3-5 and 11 ####

# Setup: attach the packages used below and load the cleaned data.
#
# The script deliberately does NOT start with rm(list = ls()). That call
# deletes every object in the caller's global environment when the file is
# source()d, while leaving attached packages and options untouched, so it
# never produced a truly clean session. Run the script in a fresh R session
# instead.
library(tidyverse)
library(igraph)
library(ggraph)
library(data.table)
library(DescTools)

# Path is relative to the current working directory. The sections below read
# the columns cohort, gender, UGUni, PGUni and PhDUni from this table.
cenl <- read_csv("nltrial_clean2.csv")

#### replicate (approximate) figure 11 in the appendix ####
# How many experts fall in each cohort label (missing values shown as well).
table(cenl$cohort, useNA = "always")

# Translate each cohort label into a single stand-in age so the cohorts can be
# drawn as a histogram. Labels that are not listed, and missing cohorts, get
# NA.
cenl$above <- case_match(
  cenl$cohort,
  "80+"   ~ 81,
  "71-80" ~ 71,
  "61-70" ~ 61,
  "51-60" ~ 51,
  "41-50" ~ 41,
  "31-40" ~ 31,
  "30-"   ~ 20
)

# Check the recoding, then draw the histogram.
table(cenl$above, useNA = "always")
hist(
  cenl$above,
  breaks = 7,
  main = "Histogram of China Watchers' estimated age",
  xlab = "age"
)

### Change in gender composition over time - figure 3 ####
# Number of experts for every cohort x gender combination (missing values of
# either variable form their own groups).
mydata <- cenl %>%
  group_by(cohort, gender) %>%
  dplyr::summarise(n = n())

# Bars are rescaled to proportions within each cohort (position = "fill").
# Rows without a cohort are dropped; any gender code other than "f" / "m",
# including a missing one, is shown as "unknown".
ps_gender <- mydata %>%
  filter(!is.na(cohort)) %>%
  mutate(
    gender = case_match(
      gender,
      "f" ~ "female",
      "m" ~ "male",
      .default = "unknown"
    )
  ) %>%
  ggplot(aes(x = cohort, y = n, fill = gender)) +
  geom_col(position = "fill") +
  labs(x = "cohort", y = "", fill = "") +
  scale_fill_grey() +
  theme_minimal()

# The title is added only to the printed copy; the stored ps_gender object
# stays untitled.
ps_gender + ggtitle("Composition of gender by age cohort")


##### Change in educational background composition over different cohorts #####

# Cohorts as shown in the plot (names) and the raw cohort labels pooled into
# each of them: the two youngest and the two oldest raw cohorts are merged.
gini_cohort_groups <- list(
  "40-"   = c("30-", "31-40"),
  "41-50" = "41-50",
  "51-60" = "51-60",
  "61-70" = "61-70",
  "71+"   = c("71-80", "80+")
)

# Gini coefficient of the number of experts per university.
#   uni         university name per expert (NA = not recorded)
#   cohort      raw cohort label per expert
#   raw_cohorts raw cohort labels that make up the group of interest
# Experts outside the group or without a recorded university are ignored.
# A higher value means the group's degrees are concentrated in fewer
# universities.
university_gini <- function(uni, cohort, raw_cohorts) {
  keep <- cohort %in% raw_cohorts & !is.na(uni)
  experts_per_uni <- as.vector(table(as.character(uni[keep])))
  Gini(experts_per_uni)
}

# One row per plot cohort with the Gini coefficient for one degree level.
# Returns a tibble with the columns cohort, eduGINI and level.
gini_by_cohort <- function(uni, cohort, level, groups = gini_cohort_groups) {
  tibble(
    cohort = names(groups),
    eduGINI = vapply(
      groups,
      function(raw_cohorts) university_gini(uni, cohort, raw_cohorts),
      numeric(1),
      USE.NAMES = FALSE
    ),
    level = level
  )
}

# Undergraduate, postgraduate and doctoral universities, in turn. The
# per-cohort scratch tables (UG1b, UG3, ...) are no longer needed here.
UGdata <- gini_by_cohort(cenl$UGUni, cenl$cohort, "UG")
PGdata <- gini_by_cohort(cenl$PGUni, cenl$cohort, "PG")
PhDdata <- gini_by_cohort(cenl$PhDUni, cenl$cohort, "PhD")

# Stack the three per-level tables into one long table for plotting.
# `level` becomes a factor (levels in alphabetical order: PG, PhD, UG).
# `color` is only used as the grouping variable of the lines below; the
# colours actually drawn come from scale_color_manual(), matched to the
# factor levels in order.
mydata <- bind_rows(UGdata, PGdata, PhDdata) %>%
  mutate(
    level = as.factor(level),
    color = case_when(
      as.numeric(level) == 1 ~ "darkgrey",
      as.numeric(level) == 3 ~ "lightgrey",
      .default = "grey"
    )
  )

# One line (plus points) per degree level across the five plot cohorts.
ps_gini <- mydata %>%
  filter(cohort %in% c("40-", "41-50", "51-60", "61-70", "71+")) %>%
  ggplot(aes(x = cohort, y = eduGINI, group = color, color = level)) +
  geom_line(linewidth = 2) +
  geom_point(size = 3) +
  scale_color_manual(values = c("lightgrey", "grey", "black")) +
  theme_minimal() +
  labs(x = "cohort", y = "GINI coefficient", color = "", group = "")

# The title is added only to the printed copy; the stored ps_gini object stays
# untitled.
ps_gini +
  ggtitle("GINI coefficient of different China Watcher cohorts'\ndiversity in educational background")


#### Chinese origin China Watchers in different cohorts - figure 4 ####

# Starting value of the UG origin label: "non-China" for every expert with a
# recorded UG university, NA where no UG university is recorded. The regional
# labels are filled in further down.
cenl$UGChina <- if_else(is.na(cenl$UGUni), NA_character_, "non-China")
table(cenl$UGChina, useNA = "always")

# University names labelled "Mainland". The entries are compared verbatim
# (via %in%) with the UGUni / PGUni / PhDUni columns further down, so spelling
# variants, Chinese-language names, combined multi-university strings and
# other raw values (a URL, "Chengdu", a name with a leading space) each need
# their own entry -- presumably they are copied from the data as entered.
# NOTE(review): "Nanyang Technological University" (Singapore?) and
# "Chinese Culture University" (Taipei?) look out of place here, and every
# entry in this list counts towards the Mainland share -- confirm against the
# coding notes before changing anything.
MainlandUni <- c("Anhui Normal University", "Anhui University", "Beijing Agricultural University", "Beijing Broadcasting Institute",
                 "Beijing foreign languages university", "Beijing Institute of Foreign Languages (now Beijing Foreign Studies University)", 
                 "Beijing International Studies University", "Beijing International Studies University (BISU)", "Beijing Normal University",
                 "Capital Normal University (Beijing, China)", "Central-South University of Technology", "Central China Normal University", 
                 "Changchun University of Science and Technology", "Chengdu", "China Foreign Affairs University", "China Youth College for Economics",
                 "Chinese Culture University", "East China University of Politics and Law in Shanghai", "East China University of Science and Technology",
                 "Fudan University", "Fujian Normal University", "Heilongjiang University", "Henan University", "http://niis.cssn.cn/zjxz/zlyjy/201208/t20120828_1988576.shtml",
                 "Huazhong University", "Huazhong University of Science and Technology", "Hubei University", "Inner Mongolia Agricultural University",
                 "Lanzhou University", "Luoyang Foreign Languages University", "Nanjing Normal University", "Nanjing University", "Nankai University",
                 "Nanyang Technological University", "North China University of Science and Technology", "Peking University", "Peking University, China.", 
                 "Peking University, Waseda University", "Remin University", "Renmin University", "Shandong Normal University", "Shandong University", 
                 "Shanghai Institute for Foreign Trade", "Shanghai International Studies University", "Shanghai Jiao Tong University", "Shanghai University of Technology",
                 "Sichuan University", "Southwest Normal University", "Southwest University of Political Science & Law", "Tongji University", 
                 "Tsinghua University", "University of International Business and Economics (Beijing)", "Wuhan University", "Xi'an Jiaotong Univeristy", 
                 "Xi'an Jiaotong University", "Xiamen University", "Xian Foreign Languages Institute", "Xiamen University, China", "Xinyang Normal University 信阳师范大学", "Zhejiang University",
                 "Zhengzhou University", "北京大学", "北京师范大学", "北京师范大学文学院", "南京大学", "吉林大学", "国际关系学院", "河南师范大学", 
                 "清华大学", "湘潭大学管理学院", "西北大学", "黑龙江省齐齐哈尔师范学院", " China Agricultural University", "Chinese Academy of Sciences",
                 "Chinese Academy of Social Sciences", "Northeast Normal University", "Party School of the Central Committee of the Communist Party of China",
                 "Peking University, Waseda Univeristy", "Renmin University and Emory University", "中国现代国际关系研究院", "北京大学国际关系学院", 
                 "复旦大学国际关系与公共事务学院", "1. Universidade Federal de Pernambuco (UFPE) 2. Fudan University, School of International Relations and Public Affairs (China)", 
                 "1. Yenching Academy, Peking University (China). 2. Schwarzman Scholars, Tsinghua University (China).", "Beijing Foregin Studies University",
                 "Beijing Foreign Studies University", "Beijing University of Science and Technology", "Berlin School of Economics and Law and the Southwestern University of Finance and Economics Chengdu",
                 "Changsha Institute of Technology, Yale University", "China Agricultural University", "China University of Political Science and Law", "Chinese Academy of Science",
                 "East China Normal University", "Hunan Normal University", "Party School of the Central Committee of C. P. C. (National Academy of Governance)",
                 "Peking University/  University of Missouri/ University of California", "Peking University/ University of Idaho", "People's University of China",
                 "PLA Academy of Military Science; School of Oriental and African Studies", "Renmin University of China", "Renmin University, Columbia",
                 "University of Colorado-Boulder, Renmin University of China", "University of International Relations, Beijing", "University of Oslo and Zhejiang University",
                 "University of Pennsylvania/ PKU", "中共中央党校获法学", "中国社会科学院研究生院", "西北大学/ 牛津大学")


# University names labelled "HK". "The University of Hong Kong" is listed
# twice; the duplicate has no effect on %in% matching.
HKUni <- c("Chinese University of Hong Kong", "Hong Kong University", "Hong Kong University of Science and Technology", "City University of Hong Kong", 
           "The University of Hong Kong", "University of Hong Kong", "香港中文大学", "Hong Kong Baptist University", "The University of Hong Kong")
# University names labelled "Taiwan". "National Cheng Kung University" and
# "中興大學" are listed twice; the duplicates have no effect on %in% matching.
# NOTE(review): "東京大學" looks like the University of Tokyo, and the
# unqualified names "Sun Yat-sen University" and "Soochow University" also
# exist on the mainland -- confirm these were checked case by case.
TaiwanUni <- c("Feng Chia University", "Fu Jen Catholic University", "Fu Jen University", "Ming Chuan University", "National Cheng Kung University", 
               "National Cheng Kung University", "National Chiao Tung University, Taiwan", "National Taichung University of Education", "National Taipei University", 
               "National Taiwan University", "NCCU", "Soochow University", "Sun Yat-sen University", "Tamkang University", "Tatung University",
               "Tunghai University", "中興大學", "台湾辅仁大学", "東京大學", "National Chengchi University", "National Chengchi University/ Sheffield University",
               "National Taiwan Normal University", "National Taiwan University & 紐約州立大學水牛城分校政治學博士班修業", "台湾政治大学", "政治大學", 
               "National Chengchi University; University of Rochester", "National Sun Yat Sen University", "National Tsinghua University", "NCCU, Columbia",
               "Sciences Po / NCCU", "Taiwan Chengchi University", "中山大學", "中興大學")

# Overwrite the "non-China" default for UG degrees whose university appears in
# one of the regional lists. Later assignments win, so a name present in
# several lists would end up as Taiwan, then HK, then Mainland.
cenl$UGChina[cenl$UGUni %in% MainlandUni] <- "Mainland"
cenl$UGChina[cenl$UGUni %in% HKUni] <- "HK"
cenl$UGChina[cenl$UGUni %in% TaiwanUni] <- "Taiwan"

# Cohorts as shown in the plot (names) and the raw cohort labels pooled into
# each of them: the two youngest and the two oldest raw cohorts are merged.
origin_cohort_groups <- list(
  "40-"   = c("30-", "31-40"),
  "41-50" = "41-50",
  "51-60" = "51-60",
  "61-70" = "61-70",
  "71+"   = c("71-80", "80+")
)

# Share of "Mainland" degrees among the experts of one cohort group.
#   origin      origin label per expert (NA = no university recorded)
#   cohort      raw cohort label per expert
#   raw_cohorts raw cohort labels that make up the group of interest
# The denominator is every expert in the group with a known origin (Mainland,
# HK, Taiwan or non-China). The result is a proportion between 0 and 1.
# A group without any Mainland degree yields 0 and a group without any known
# origin yields NA; indexing the count table by "Mainland" returned a
# zero-length value in both cases, which made the assignment fail with
# "replacement has length zero".
mainland_share <- function(origin, cohort, raw_cohorts) {
  known <- cohort %in% raw_cohorts & !is.na(origin)
  if (!any(known)) {
    return(NA_real_)
  }
  sum(origin[known] == "Mainland") / sum(known)
}

# One row per plot cohort with the Mainland share for one degree level.
# Returns a tibble with the columns cohort, percent and level.
mainland_share_table <- function(origin, cohort, level,
                                 groups = origin_cohort_groups) {
  tibble(
    cohort = names(groups),
    percent = vapply(
      groups,
      function(raw_cohorts) mainland_share(origin, cohort, raw_cohorts),
      numeric(1),
      USE.NAMES = FALSE
    ),
    level = level
  )
}

# The per-cohort scratch tables (UG1b, UG3, ...) are no longer created; the
# share tables below are built directly from the label columns.
UGChina <- mainland_share_table(cenl$UGChina, cenl$cohort, "UG")

# Same for PG #
cenl$PGChina <- "non-China"
cenl$PGChina[is.na(cenl$PGUni)] <- NA
table(cenl$PGChina, useNA = "always")

cenl$PGChina[cenl$PGUni %in% MainlandUni] <- "Mainland"
cenl$PGChina[cenl$PGUni %in% HKUni] <- "HK"
cenl$PGChina[cenl$PGUni %in% TaiwanUni] <- "Taiwan"

# PG universities of everyone not labelled Mainland, for a manual check of the
# name lists.
table(cenl$PGUni[cenl$PGChina != "Mainland"])

PGChina <- mainland_share_table(cenl$PGChina, cenl$cohort, "PG")

# Same for PhD #
cenl$PhDChina <- "non-China"
cenl$PhDChina[is.na(cenl$PhDUni)] <- NA
table(cenl$PhDChina, useNA = "always")

cenl$PhDChina[cenl$PhDUni %in% MainlandUni] <- "Mainland"
cenl$PhDChina[cenl$PhDUni %in% HKUni] <- "HK"
cenl$PhDChina[cenl$PhDUni %in% TaiwanUni] <- "Taiwan"

PhDChina <- mainland_share_table(cenl$PhDChina, cenl$cohort, "PhD")

# Stack the three per-level share tables into one long table for plotting.
# `level` becomes a factor (levels in alphabetical order: PG, PhD, UG).
# `color` is only used as the grouping variable of the lines below; the
# colours actually drawn come from scale_color_manual(), matched to the
# factor levels in order.
mydata <- bind_rows(UGChina, PGChina, PhDChina) %>%
  mutate(
    level = as.factor(level),
    color = case_when(
      as.numeric(level) == 1 ~ "lightgrey",
      as.numeric(level) == 2 ~ "darkgrey",
      as.numeric(level) == 3 ~ "black",
      .default = "grey"
    )
  )

# One line (plus points) per degree level across the five plot cohorts.
# NOTE(review): `percent` holds proportions (0-1) while the axis title says
# "%" -- confirm this matches the published figure before rescaling.
ps_Chinadegree <- mydata %>%
  filter(cohort %in% c("40-", "41-50", "51-60", "61-70", "71+")) %>%
  ggplot(aes(x = cohort, y = percent, group = color, color = level)) +
  geom_line(linewidth = 2) +
  geom_point(size = 3) +
  scale_color_manual(values = c("lightgrey", "grey", "black")) +
  theme_minimal() +
  labs(
    x = "cohort",
    y = "% of China Watchers with degrees from China",
    color = "",
    group = ""
  )

# The title is added only to the printed copy; the stored ps_Chinadegree
# object stays untitled.
ps_Chinadegree +
  ggtitle("Percentage of China Watchers in each cohort having\nreceived their degree in China")
